% Step 1. Open data
%- open csv 
%- unpack data
% Step 2. Prepare data 
%- assign donations to integer bins
%- keep donations <=300
% Step 3. Identify probability mass function from donations under AD
% treatment
% Step 4. Save data for estimation
% save variables in data structure

clc;
clear all;

%% Step 1: open data
% Read the numeric contents of the CSV, skipping the first row
% (row offset 1, column offset 0).
data = csvread('data_section4.csv', 1, 0);

% Column 1 is stored as the default indicator, column 2 as the donation.
default  = data(:,1);
donation = data(:,2);

% Number of observations before any filtering (used for share_obs below).
total_obs = length(donation);

%% Step 2: preparation
% Set donations strictly between 0 and 1 to 1, so that after rounding
% every 0 is a genuine non-donation.
donation(donation<1 & donation>0) = 1;

%% Integer bins
% Round donations to match integer bins.
donation = round(donation);

%% Keep obs <= 300
% Keep donations smaller than or equal to 300; apply the same mask to
% default so the two vectors stay aligned.
subsample = (donation<=300);
donation  = donation(subsample);
default   = default(subsample);

% Share of the original observations that survive the filter.
share_obs = length(donation)/total_obs;

%% Step 3: relative frequency with varying bin width
% To account for the discrete nature of the data, the relative frequency
% is computed with increasing bandwidth from donation 0 to 300.
% Only observations with default==0 (the AD treatment) are used.
% NOTE(review): relfreq_varyingbin is defined outside this file; the
% meaning of its five outputs is taken from their names only -- confirm.
donationc = donation(default==0);
[relfreq, relcount, count, bandwidth, total] = relfreq_varyingbin(donationc);

%% Step 4: pack in data structure
% Trailing semicolons keep MATLAB from echoing the full data vectors to
% the Command Window on every run.
d.relfreq  = relfreq;
d.donation = donation;
d.default  = default;

% Save final dataset and structure.
save('section4_all.mat', 'd');

%% Excluding some of the zero contributions
% Build a sample in which zero contributions are added to the positive
% donors until each default group reaches rate_sample of its original size.
% Set rate_sample to the share of the sample that you want to keep:
%   for 100% of the sample:            rate_sample = 1
%   to add zeros up to 3.5% of sample: rate_sample = 0.035, etc.
clc;
clear all;

%% Step 1: open data
% Read the numeric contents of the CSV, skipping the first row.
data = csvread('data_section4.csv', 1, 0);

% Column 1 is stored as the default indicator, column 2 as the donation.
default  = data(:,1);
donation = data(:,2);

%% Number of zero contributions to add per default group
% (left unsuppressed on purpose: these scalars are echoed as a run summary)
rate_sample = 0.035

% For each default group: target group size (original group size times
% rate_sample) minus the number of positive donors in that group.
% NOTE(review): positive donors above 300 are counted here but dropped in
% Step 2, so the assembled group can end up below the target -- confirm
% that this is intended.
nzeros_ad = round(sum(default==0)*rate_sample  - sum(donation>0 & default==0))
nzeros_10 = round(sum(default==10)*rate_sample - sum(donation>0 & default==10))
nzeros_20 = round(sum(default==20)*rate_sample - sum(donation>0 & default==20))
nzeros_50 = round(sum(default==50)*rate_sample - sum(donation>0 & default==50))

% zeros()/ones() treat a negative size as 0, so a negative count silently
% adds nothing; make that situation visible instead.
if any([nzeros_ad nzeros_10 nzeros_20 nzeros_50] < 0)
    warning('rate_sample is too small for at least one default group: positive donors already exceed the target share, no zeros are added there.');
end

%% Step 2: preparation
% Set donations strictly between 0 and 1 to 1 before rounding, exactly as
% in the first half of this script. Without this step, donations in
% (0, 0.5) round to 0 and are removed by the donation>0 filter below even
% though they were counted as positive donors in nzeros_* above.
donation(donation<1 & donation>0) = 1;

% Round donations to match integer bins.
donation = round(donation);

% Keep positive donations smaller than or equal to 300; apply the same
% mask to default so the two vectors stay aligned.
subsample = (donation<=300 & donation>0);
donation  = donation(subsample);
default   = default(subsample);

%% Add zero contributors
% For each default group append nzeros_* rows with donation 0 and the
% matching default value.
add_don_AD = zeros(nzeros_ad,1);
donation   = vertcat(donation, add_don_AD);
add_def_AD = zeros(nzeros_ad,1);
default    = vertcat(default, add_def_AD);

add_don_10 = zeros(nzeros_10,1);
donation   = vertcat(donation, add_don_10);
add_def_10 = ones(nzeros_10,1).*10;
default    = vertcat(default, add_def_10);

add_don_20 = zeros(nzeros_20,1);
donation   = vertcat(donation, add_don_20);
add_def_20 = ones(nzeros_20,1).*20;
default    = vertcat(default, add_def_20);

add_don_50 = zeros(nzeros_50,1);
donation   = vertcat(donation, add_don_50);
add_def_50 = ones(nzeros_50,1).*50;
default    = vertcat(default, add_def_50);

%% Step 3: relative frequency with varying bin width
% To account for the discrete nature of the data, the relative frequency
% is computed with increasing bandwidth from donation 0 to 300.
% Only observations with default==0 (the AD treatment) are used.
% NOTE(review): relfreq_varyingbin is defined outside this file; the
% meaning of its five outputs is taken from their names only -- confirm.
donationc = donation(default==0);
[relfreq, relcount, count, bandwidth, total] = relfreq_varyingbin(donationc);

%% Step 4: pack in data structure and save
% Trailing semicolons keep MATLAB from echoing the full data vectors to
% the Command Window on every run.
d.relfreq  = relfreq;
d.donation = donation;
d.default  = default;

% Save final dataset for section 4 and structure.
% File naming: section4_XX with XX indicating the share of observations
% kept (all, 35 for 3.5 percent). The name is hard-coded: update it by
% hand whenever rate_sample is changed.
save('section4_35.mat', 'd');
%}
 